Machine Learning Models for Geographic and Remote Work Analysis

KMeans Clustering, Salary Prediction, and Remote Work Classification

Authors
Affiliation

Bingrui Qiao

Zhengyu Zhou

Junhao Wang

Boston University

1 Classification: Remote vs Non-Remote Jobs

# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Set plotly renderer
pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
df = pd.read_csv("data/lightcast_job_postings.csv", low_memory=False)

# Print schema and preview first few rows
print("--- Diagnostic check: Schema and sample rows ---")
print(df.info())
print(df.head())
--- Diagnostic check: Schema and sample rows ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72498 entries, 0 to 72497
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: float64(38), object(93)
memory usage: 72.5+ MB
None
                                         ID LAST_UPDATED_DATE  \
0  1f57d95acf4dc67ed2819eb12f049f6a5c11782c          9/6/2024   
1  0cb072af26757b6c4ea9464472a50a443af681ac          8/2/2024   
2  85318b12b3331fa490d32ad014379df01855c557          9/6/2024   
3  1b5c3941e54a1889ef4f8ae55b401a550708a310          9/6/2024   
4  cb5ca25f02bdf25c13edfede7931508bfd9e858f         6/19/2024   

      LAST_UPDATED_TIMESTAMP  DUPLICATES    POSTED    EXPIRED  DURATION  \
0  2024-09-06 20:32:57.352 Z         0.0  6/2/2024   6/8/2024       6.0   
1  2024-08-02 17:08:58.838 Z         0.0  6/2/2024   8/1/2024       NaN   
2  2024-09-06 20:32:57.352 Z         1.0  6/2/2024   7/7/2024      35.0   
3  2024-09-06 20:32:57.352 Z         1.0  6/2/2024  7/20/2024      48.0   
4  2024-06-19 07:00:00.000 Z         0.0  6/2/2024  6/17/2024      15.0   

             SOURCE_TYPES                                        SOURCES  \
0       [\n  "Company"\n]                        [\n  "brassring.com"\n]   
1     [\n  "Job Board"\n]                            [\n  "maine.gov"\n]   
2     [\n  "Job Board"\n]                           [\n  "dejobs.org"\n]   
3     [\n  "Job Board"\n]  [\n  "disabledperson.com",\n  "dejobs.org"\n]   
4  [\n  "FreeJobBoard"\n]                       [\n  "craigslist.org"\n]   

                                                 URL  ... NAICS_2022_2  \
0  [\n  "https://sjobs.brassring.com/TGnewUI/Sear...  ...         44.0   
1   [\n  "https://joblink.maine.gov/jobs/1085740"\n]  ...         56.0   
2  [\n  "https://dejobs.org/dallas-tx/data-analys...  ...         52.0   
3  [\n  "https://www.disabledperson.com/jobs/5948...  ...         52.0   
4  [\n  "https://modesto.craigslist.org/sls/77475...  ...         99.0   

                                   NAICS_2022_2_NAME NAICS_2022_3  \
0                                       Retail Trade        441.0   
1  Administrative and Support and Waste Managemen...        561.0   
2                              Finance and Insurance        524.0   
3                              Finance and Insurance        522.0   
4                              Unclassified Industry        999.0   

                              NAICS_2022_3_NAME NAICS_2022_4  \
0               Motor Vehicle and Parts Dealers       4413.0   
1           Administrative and Support Services       5613.0   
2     Insurance Carriers and Related Activities       5242.0   
3  Credit Intermediation and Related Activities       5221.0   
4                         Unclassified Industry       9999.0   

                                   NAICS_2022_4_NAME  NAICS_2022_5  \
0  Automotive Parts, Accessories, and Tire Retailers       44133.0   
1                                Employment Services       56132.0   
2  Agencies, Brokerages, and Other Insurance Rela...       52429.0   
3                   Depository Credit Intermediation       52211.0   
4                              Unclassified Industry       99999.0   

                            NAICS_2022_5_NAME NAICS_2022_6  \
0  Automotive Parts and Accessories Retailers     441330.0   
1                     Temporary Help Services     561320.0   
2          Other Insurance Related Activities     524291.0   
3                          Commercial Banking     522110.0   
4                       Unclassified Industry     999999.0   

                            NAICS_2022_6_NAME  
0  Automotive Parts and Accessories Retailers  
1                     Temporary Help Services  
2                            Claims Adjusting  
3                          Commercial Banking  
4                       Unclassified Industry  

[5 rows x 131 columns]
# Take subset of relevant columns
relevant_columns = [
    "SALARY", "MIN_YEARS_EXPERIENCE", "EDUCATION_LEVELS_NAME",
    "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION", 
    "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "STATE_NAME", "CITY_NAME",
    "MSA_NAME", "ONET", "ONET_NAME", "NAICS2_NAME", "TITLE_NAME"
]

df_analysis = df[relevant_columns].copy()

df_analysis.head()
SALARY MIN_YEARS_EXPERIENCE EDUCATION_LEVELS_NAME EMPLOYMENT_TYPE_NAME REMOTE_TYPE_NAME DURATION IS_INTERNSHIP COMPANY_IS_STAFFING STATE_NAME CITY_NAME MSA_NAME ONET ONET_NAME NAICS2_NAME TITLE_NAME
0 NaN 2.0 [\n "Bachelor's degree"\n] Full-time (> 32 hours) [None] 6.0 False False Arkansas El Dorado, AR El Dorado, AR 15-2051.01 Business Intelligence Analysts Retail Trade Enterprise Analysts
1 NaN 3.0 [\n "No Education Listed"\n] Full-time (> 32 hours) Remote NaN False True Maine Augusta, ME Augusta-Waterville, ME 15-2051.01 Business Intelligence Analysts Administrative and Support and Waste Managemen... Oracle Consultants
2 NaN 5.0 [\n "Bachelor's degree"\n] Full-time (> 32 hours) [None] 35.0 False False Texas Dallas, TX Dallas-Fort Worth-Arlington, TX 15-2051.01 Business Intelligence Analysts Finance and Insurance Data Analysts
3 NaN 3.0 [\n "No Education Listed"\n] Full-time (> 32 hours) [None] 48.0 False False Arizona Phoenix, AZ Phoenix-Mesa-Chandler, AZ 15-2051.01 Business Intelligence Analysts Finance and Insurance Management Analysts
4 92500.0 NaN [\n "No Education Listed"\n] Part-time / full-time [None] 15.0 False False California Modesto, CA Modesto, CA 15-2051.01 Business Intelligence Analysts Unclassified Industry Unclassified
df_analysis["REMOTE_TYPE_NAME"] = df_analysis["REMOTE_TYPE_NAME"].astype(str).str.strip()

remote_col = df_analysis["REMOTE_TYPE_NAME"].replace({"[None]": None})

df_analysis["REMOTE_GROUP"] = np.select(
    [
        remote_col.eq("Remote"),
        remote_col.eq("Hybrid Remote"),
        remote_col.eq("Not Remote"),
        remote_col.isna()
    ],
    ["Remote", "Hybrid", "Onsite", "Onsite"],
    default="Onsite"
)

df_analysis.drop(columns=["REMOTE_TYPE_NAME"], inplace=True)


# EMPLOYMENT_GROUP
emp_col = df_analysis["EMPLOYMENT_TYPE_NAME"].astype(str).str.strip()

df_analysis["EMPLOYMENT_GROUP"] = np.select(
    [
        emp_col.eq("Full-time (> 32 hours)"),
        emp_col.eq("Part-time (⤠32 hours)"),
        emp_col.eq("Part-time / full-time"),
        emp_col.isna()
    ],
    ["Full-time", "Part-time", "Flexible", "Full-time"],
    default="Flexible"
)

df_analysis.drop(columns=["EMPLOYMENT_TYPE_NAME"], inplace=True)

# MIN_YEARS_EXPERIENCE & group

df_analysis["MIN_YEARS_EXPERIENCE"] = df_analysis["MIN_YEARS_EXPERIENCE"].fillna(0)

# make sure it is numerical
df_analysis["MIN_YEARS_EXPERIENCE"] = pd.to_numeric(
    df_analysis["MIN_YEARS_EXPERIENCE"],
    errors="coerce"
).fillna(0)

exp = df_analysis["MIN_YEARS_EXPERIENCE"]

df_analysis["MIN_YEARS_EXPERIENCE_GROUP"] = np.select(
    [
        (exp >= 0) & (exp <= 1),
        (exp > 1) & (exp <= 3),
        (exp > 3) & (exp <= 5),
        (exp > 5) & (exp <= 10),
        (exp > 10)
    ],
    ["Internship/Entry Level", "Junior", "Mid-Level", "Senior", "Expert"],
    default="Internship/Entry Level"
)


# DURATION:null value & 0 -> 1

dur = df_analysis["DURATION"]
df_analysis["DURATION"] = (
    dur.fillna(1)
       .replace(0, 1)
)


# clean EDUCATION_LEVELS_NAME -> EDUCATION_LEVELS_CLEAN

edu_raw = df_analysis["EDUCATION_LEVELS_NAME"].fillna("")
edu_clean = (
    edu_raw
    .astype(str)
    .str.replace(r"[\[\]\n]", "", regex=True)
    .str.strip()
)
df_analysis["EDUCATION_LEVELS_CLEAN"] = edu_clean
df_analysis.drop(columns=["EDUCATION_LEVELS_NAME"], inplace=True)


# Fill in the blank STATE_NAME/CITY_NAME


df_analysis["STATE_NAME"] = df_analysis["STATE_NAME"].fillna("Unknown")
df_analysis["CITY_NAME"] = df_analysis["CITY_NAME"].fillna("Unknown")


# ONET/ONET_NAME/NAICS2_NAME null value handling

df_analysis["ONET"] = df_analysis["ONET"].fillna("00-0000.00")
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["NAICS2_NAME"] = df_analysis["NAICS2_NAME"].fillna("Unknown")

print(df_analysis.head())
    SALARY  MIN_YEARS_EXPERIENCE  DURATION IS_INTERNSHIP COMPANY_IS_STAFFING  \
0      NaN                   2.0       6.0         False               False   
1      NaN                   3.0       1.0         False                True   
2      NaN                   5.0      35.0         False               False   
3      NaN                   3.0      48.0         False               False   
4  92500.0                   0.0      15.0         False               False   

   STATE_NAME      CITY_NAME                         MSA_NAME        ONET  \
0    Arkansas  El Dorado, AR                    El Dorado, AR  15-2051.01   
1       Maine    Augusta, ME           Augusta-Waterville, ME  15-2051.01   
2       Texas     Dallas, TX  Dallas-Fort Worth-Arlington, TX  15-2051.01   
3     Arizona    Phoenix, AZ        Phoenix-Mesa-Chandler, AZ  15-2051.01   
4  California    Modesto, CA                      Modesto, CA  15-2051.01   

                        ONET_NAME  \
0  Business Intelligence Analysts   
1  Business Intelligence Analysts   
2  Business Intelligence Analysts   
3  Business Intelligence Analysts   
4  Business Intelligence Analysts   

                                         NAICS2_NAME           TITLE_NAME  \
0                                       Retail Trade  Enterprise Analysts   
1  Administrative and Support and Waste Managemen...   Oracle Consultants   
2                              Finance and Insurance        Data Analysts   
3                              Finance and Insurance  Management Analysts   
4                              Unclassified Industry         Unclassified   

  REMOTE_GROUP EMPLOYMENT_GROUP MIN_YEARS_EXPERIENCE_GROUP  \
0       Onsite        Full-time                     Junior   
1       Remote        Full-time                     Junior   
2       Onsite        Full-time                  Mid-Level   
3       Onsite        Full-time                     Junior   
4       Onsite         Flexible     Internship/Entry Level   

  EDUCATION_LEVELS_CLEAN  
0    "Bachelor's degree"  
1  "No Education Listed"  
2    "Bachelor's degree"  
3  "No Education Listed"  
4  "No Education Listed"  
# Prepare binary classification data (Remote vs Onsite only)

df_binary = df_analysis[df_analysis["REMOTE_GROUP"].isin(["Remote", "Onsite"])].copy()

print(df_binary["REMOTE_GROUP"].value_counts())
#
REMOTE_GROUP
Onsite    57741
Remote    12497
Name: count, dtype: int64
# Feature Engineering - Encode categorical variables


# Define categorical and numeric columns
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "NAICS2_NAME"
]
numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]

# 1. create label

label_encoder = LabelEncoder()
df_binary["label"] = label_encoder.fit_transform(df_binary["REMOTE_GROUP"])

# Take a look at the category order
print("Label mapping:", dict(zip(label_encoder.classes_,
                                 label_encoder.transform(label_encoder.classes_))))

# 2. create features(indexer + onehot + VectorAssembler)
X = df_binary[categorical_cols + numeric_cols]
y = df_binary["label"]

# ColumnTransformer 
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

X_features = preprocess.fit_transform(X)

print("--- Prepared Data Preview (pandas + sklearn) ---")
print("X_features type:", type(X_features))
print("X_features shape:", X_features.shape)
print("First 5 labels:", y.iloc[:5].tolist())
print("First 5 REMOTE_GROUP:", df_binary["REMOTE_GROUP"].iloc[:5].tolist())


if hasattr(X_features, "toarray"):
    X_dense = X_features.toarray()
else:
    X_dense = np.asarray(X_features)


df_prepared = pd.DataFrame({
    "REMOTE_GROUP": df_binary["REMOTE_GROUP"].reset_index(drop=True),
    "label": y.reset_index(drop=True),     # df_binary["label"]
    "features": list(X_dense)             # One for each line numpy array
})

print("\n--- df_prepared preview ---")
print(df_prepared.head())
print(df_prepared.shape)
Label mapping: {'Onsite': np.int64(0), 'Remote': np.int64(1)}
--- Prepared Data Preview (pandas + sklearn) ---
X_features type: <class 'scipy.sparse._csr.csr_matrix'>
X_features shape: (70238, 112)
First 5 labels: [0, 1, 0, 0, 0]
First 5 REMOTE_GROUP: ['Onsite', 'Remote', 'Onsite', 'Onsite', 'Onsite']

--- df_prepared preview ---
  REMOTE_GROUP  label                                           features
0       Onsite      0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1       Remote      1  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2       Onsite      0  [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
3       Onsite      0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4       Onsite      0  [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
(70238, 3)
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(
    df_prepared,
    test_size=0.3,        # 30% for test
    random_state=42,      # seed=42
)

print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")
Training set size: 49166
Test set size: 21072
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# break out X / y
X_train = np.stack(train_data["features"].to_numpy())   # shape: (n_train, n_features)
y_train = train_data["label"].to_numpy()

X_test  = np.stack(test_data["features"].to_numpy())
y_test  = test_data["label"].to_numpy()

# Logistic Regression
lr_model = LogisticRegression(
    max_iter=1000,      
    n_jobs=-1
)
lr_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,   # = numTrees
    random_state=42,    # = seed
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

print("Both models trained successfully!")
Both models trained successfully!

2 Predict

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# First extract X_test/y_test from test_data
X_test = np.stack(test_data["features"].to_numpy())
y_test = test_data["label"].to_numpy()


# Predict

# Logistic Regression
y_pred_lr = lr_model.predict(X_test)

# Obtain the positive class probability using predict_proba and calculate the AUC
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# Random Forest
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Evaluation:Accuracy / F1 / AUC-ROC
from sklearn.metrics import f1_score

print("LOGISTIC REGRESSION RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_lr, average='weighted'):.4f}")
print(f"AUC-ROC:  {roc_auc_score(y_test, y_proba_lr):.4f}")

print("RANDOM FOREST RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"AUC-ROC:  {roc_auc_score(y_test, y_proba_rf):.4f}")
LOGISTIC REGRESSION RESULTS
Accuracy: 0.8250
F1 Score (weighted): 0.7533
AUC-ROC:  0.6368
RANDOM FOREST RESULTS
Accuracy: 0.8349
F1 Score (weighted): 0.8168
AUC-ROC:  0.7293
# Step 7: Confusion Matrix
from sklearn.metrics import confusion_matrix

print("\n=== Logistic Regression Confusion Matrix (2x2) ===")
print("(rows = true label, cols = predicted label)")

cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_lr_df = pd.DataFrame(
    cm_lr,
    index=["True 0 (Onsite)", "True 1 (Remote)"],
    columns=["Pred 0 (Onsite)", "Pred 1 (Remote)"]
)
print(cm_lr_df)

print("\n=== Random Forest Confusion Matrix (2x2) ===")
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
cm_rf_df = pd.DataFrame(
    cm_rf,
    index=["True 0 (Onsite)", "True 1 (Remote)"],
    columns=["Pred 0 (Onsite)", "Pred 1 (Remote)"]
)
print(cm_rf_df)

=== Logistic Regression Confusion Matrix (2x2) ===
(rows = true label, cols = predicted label)
                 Pred 0 (Onsite)  Pred 1 (Remote)
True 0 (Onsite)            17274               59
True 1 (Remote)             3628              111

=== Random Forest Confusion Matrix (2x2) ===
                 Pred 0 (Onsite)  Pred 1 (Remote)
True 0 (Onsite)            16372              961
True 1 (Remote)             2517             1222

3 confusion matrix

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Calculate the confusion matrix (in the order of [0, 1])
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])

print("Logistic Regression CM:\n", cm_lr)
print("Random Forest CM:\n", cm_rf)

# The z (list of list) required for converting to plotly
lr_z = cm_lr.tolist()
rf_z = cm_rf.tolist()

lr_x = ['Onsite', 'Remote']  # Predicted label
lr_y = ['Onsite', 'Remote']  # True label

# Visual
fig = make_subplots(
    rows=1, cols=2, 
    subplot_titles=('Logistic Regression', 'Random Forest')
)

# Logistic Regression heatmap
fig.add_trace(
    go.Heatmap(
        z=lr_z,
        x=lr_x,
        y=lr_y,
        colorscale='Blues',
        text=lr_z,
        texttemplate="%{text}",
        showscale=False
    ),
    row=1, col=1
)

# Random Forest heatmap
fig.add_trace(
    go.Heatmap(
        z=rf_z,
        x=lr_x,
        y=lr_y,
        colorscale='Blues',
        text=rf_z,
        texttemplate="%{text}",
        showscale=True
    ),
    row=1, col=2
)

fig.update_layout(
    title_text="Remote vs Onsite Job Classification - Confusion Matrix Comparison",
    title_font_size=16,
    height=400,
    width=900
)

fig.update_xaxes(title_text="Predicted Label")
fig.update_yaxes(title_text="True Label")

fig.write_image("figures/confusion_matrix_comparison.jpg")
Logistic Regression CM:
 [[17274    59]
 [ 3628   111]]
Random Forest CM:
 [[16372   961]
 [ 2517  1222]]

4 KMeans Clustering using ONET as reference label

# KMeans Clustering using ONET as reference label

# Check ONET distribution first
onet_dist_df = (
    df_analysis["ONET_NAME"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "ONET_NAME", "ONET_NAME": "count"})
)

print(onet_dist_df.head(20))
                            count  count
0  Business Intelligence Analysts  72454
1                         Unknown     44

5 Prepare features for KMeans clustering

# Prepare features for KMeans clustering
# Define columns
cluster_categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "REMOTE_GROUP",
]
cluster_numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]

# Prepare the feature table X (only including the columns used for clustering)
X_cluster = df_analysis[cluster_categorical_cols + cluster_numeric_cols]

# 3. ColumnTransformer = StringIndexer + OneHotEncoder + VectorAssembler
cluster_preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cluster_categorical_cols),
        ("num", "passthrough", cluster_numeric_cols),
    ]
)

cluster_pipeline = Pipeline(
    steps=[
        ("preprocess", cluster_preprocess)
    ]
)

# Fit and merge to generate the features matrix
X_cluster_features = cluster_pipeline.fit_transform(X_cluster)

if hasattr(X_cluster_features, "toarray"):
    X_cluster_dense = X_cluster_features.toarray()
else:
    X_cluster_dense = np.asarray(X_cluster_features)

print("--- Clustering Features Shape ---")
print(X_cluster_dense.shape)

# 5. ONET_NAME -> ONET_label
onet_le = LabelEncoder()
df_analysis = df_analysis.copy()
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["ONET_label"] = onet_le.fit_transform(df_analysis["ONET_NAME"])

print("ONET label mapping (first few):")
for name, idx in list(zip(onet_le.classes_, range(len(onet_le.classes_))))[:10]:
    print(idx, "->", name)

# 6. group df_cluster:ONET_NAME | ONET_label | features
df_cluster = df_analysis.copy()
df_cluster["features"] = list(X_cluster_dense)

print("--- Clustering Data Prepared (pandas) ---")
print(df_cluster[["ONET_NAME", "ONET_label", "features"]].head())
print(df_cluster.shape)
--- Clustering Features Shape ---
(72498, 94)
ONET label mapping (first few):
0 -> Business Intelligence Analysts
1 -> Unknown
--- Clustering Data Prepared (pandas) ---
                        ONET_NAME  ONET_label  \
0  Business Intelligence Analysts           0   
1  Business Intelligence Analysts           0   
2  Business Intelligence Analysts           0   
3  Business Intelligence Analysts           0   
4  Business Intelligence Analysts           0   

                                            features  
0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...  
3  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
4  [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
(72498, 18)

6 Find optimal K using Elbow Method

# Find optimal K using Elbow Method
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Extract the feature matrix from the df_cluster
X_cluster = np.stack(df_cluster["features"].to_numpy())   # shape: (n_samples, n_features)

k_values = [3, 5, 7, 10, 15, 20]
silhouette_scores = []

print("--- Finding Optimal K ---")
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        random_state=42,    # seed = 42
        n_init="auto"      
    )
    labels = kmeans.fit_predict(X_cluster)

    score = silhouette_score(X_cluster, labels)
    silhouette_scores.append(score)

    print(f"K = {k}: Silhouette Score = {score:.4f}")
--- Finding Optimal K ---
K = 3: Silhouette Score = 0.5132
K = 5: Silhouette Score = 0.4411
K = 7: Silhouette Score = 0.3804
K = 10: Silhouette Score = 0.3590
K = 15: Silhouette Score = 0.3334
K = 20: Silhouette Score = 0.2926

7 Extract the feature matrix

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Extract the feature matrix
X_cluster = np.stack(df_cluster["features"].to_numpy())  # shape: (n_samples, n_features)

# Train the final KMeans model
optimal_k = 3

kmeans_final = KMeans(
    n_clusters=optimal_k,
    random_state=42,   # seed = 42
    n_init="auto"
)

cluster_labels = kmeans_final.fit_predict(X_cluster)  

# Write the cluster back to the DataFrame
df_clustered = df_cluster.copy()
df_clustered["cluster"] = cluster_labels   

# 4. calculate silhouette
final_score = silhouette_score(X_cluster, cluster_labels)
print(f"\n--- Final KMeans Model (K={optimal_k}) ---")
print(f"Silhouette Score: {final_score:.4f}")

# Cluster Distribution
print("\n--- Cluster Distribution ---")
print(
    df_clustered["cluster"]
    .value_counts()
    .sort_index()
)

--- Final KMeans Model (K=3) ---
Silhouette Score: 0.5132

--- Cluster Distribution ---
cluster
0    19473
1    39891
2    13134
Name: count, dtype: int64

8 Analyze clusters vs ONET reference labels

# Cross-tabulation of clusters and ONET
print("--- Top ONET categories in each cluster ---")
for cluster_id in range(optimal_k):
    print(f"\n=== Cluster {cluster_id} ===")
    top_onet_df = (
        df_clustered[df_clustered["cluster"] == cluster_id]["ONET_NAME"]
        .value_counts()
        .reset_index()
        .rename(columns={"index": "ONET_NAME", "ONET_NAME": "count"})
        .head(5)
    )
    print(top_onet_df)
--- Top ONET categories in each cluster ---

=== Cluster 0 ===
                            count  count
0  Business Intelligence Analysts  19472
1                         Unknown      1

=== Cluster 1 ===
                            count  count
0  Business Intelligence Analysts  39851
1                         Unknown     40

=== Cluster 2 ===
                            count  count
0  Business Intelligence Analysts  13131
1                         Unknown      3

9 Analyze cluster characteristics

print("--- Cluster Characteristics ---")

# === Remote work distribution by cluster ===
print("\n=== Remote Work by Cluster ===")
remote_by_cluster = (
    df_clustered
    .groupby(["cluster", "REMOTE_GROUP"])
    .size()
    .reset_index(name="count")
    .sort_values(["cluster", "REMOTE_GROUP"])
)
print(remote_by_cluster)

remote_pivot = remote_by_cluster.pivot(
    index="cluster",
    columns="REMOTE_GROUP",
    values="count"
).fillna(0).astype(int)
print("\nRemote work pivot table:")
print(remote_pivot)

# === Experience level by cluster ===
print("\n=== Experience Level by Cluster ===")
exp_by_cluster = (
    df_clustered
    .groupby(["cluster", "MIN_YEARS_EXPERIENCE_GROUP"])
    .size()
    .reset_index(name="count")
    .sort_values(["cluster", "MIN_YEARS_EXPERIENCE_GROUP"])
)
print(exp_by_cluster)

exp_pivot = exp_by_cluster.pivot(
    index="cluster",
    columns="MIN_YEARS_EXPERIENCE_GROUP",
    values="count"
).fillna(0).astype(int)
print("\nExperience level pivot table:")
print(exp_pivot)

# === Top states by cluster ===
print("\n=== Top States by Cluster ===")
for cluster_id in range(optimal_k):
    print(f"\n--- Cluster {cluster_id} Top States ---")
    top_states = (
        df_clustered[df_clustered["cluster"] == cluster_id]["STATE_NAME"]
        .value_counts()
        .head(5)
    )
    print(top_states)
--- Cluster Characteristics ---

=== Remote Work by Cluster ===
   cluster REMOTE_GROUP  count
0        0       Hybrid    540
1        0       Onsite  15203
2        0       Remote   3730
3        1       Hybrid   1177
4        1       Onsite  32518
5        1       Remote   6196
6        2       Hybrid    543
7        2       Onsite  10020
8        2       Remote   2571

Remote work pivot table:
REMOTE_GROUP  Hybrid  Onsite  Remote
cluster                             
0                540   15203    3730
1               1177   32518    6196
2                543   10020    2571

=== Experience Level by Cluster ===
    cluster MIN_YEARS_EXPERIENCE_GROUP  count
0         0                     Expert    922
1         0     Internship/Entry Level   7249
2         0                     Junior   3628
3         0                  Mid-Level   3894
4         0                     Senior   3780
5         1                     Expert   1847
6         1     Internship/Entry Level  14474
7         1                     Junior   7179
8         1                  Mid-Level   7681
9         1                     Senior   8710
10        2                     Expert    705
11        2     Internship/Entry Level   4953
12        2                     Junior   2438
13        2                  Mid-Level   2348
14        2                     Senior   2690

Experience level pivot table:
MIN_YEARS_EXPERIENCE_GROUP  Expert  Internship/Entry Level  Junior  Mid-Level  \
cluster                                                                         
0                              922                    7249    3628       3894   
1                             1847                   14474    7179       7681   
2                              705                    4953    2438       2348   

MIN_YEARS_EXPERIENCE_GROUP  Senior  
cluster                             
0                             3780  
1                             8710  
2                             2690  

=== Top States by Cluster ===

--- Cluster 0 Top States ---
STATE_NAME
Texas         2098
California    2041
Florida       1069
Virginia      1000
New York       801
Name: count, dtype: int64

--- Cluster 1 Top States ---
STATE_NAME
Texas         4519
California    3839
Illinois      2417
Virginia      2082
New York      1969
Name: count, dtype: int64

--- Cluster 2 Top States ---
STATE_NAME
Texas         1450
California    1204
Florida        636
New York       571
Virginia       554
Name: count, dtype: int64

10 Visualization: Elbow Plot

import pandas as pd
import plotly.express as px

# obtain the cluster frequency
cluster_counts_series = (
    df_clustered["cluster"]
    .value_counts()
    .sort_index()
)

# Clearly create a DataFrame with only two columns, "cluster" and "count"
cluster_counts = pd.DataFrame({
    "cluster": cluster_counts_series.index,
    "count": cluster_counts_series.values
})

print(cluster_counts)  # Check if the column names are ['cluster', 'count']

# Visual
fig = px.bar(
    cluster_counts,
    x="cluster",
    y="count",
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    labels={"cluster": "Cluster", "count": "Number of Jobs"},
    template="plotly_white",
    color="count",
    color_continuous_scale="Blues",
)

fig.update_layout(font=dict(family="Roboto", size=14))

fig.write_image("figures/kmeans_elbow_plot.jpg")
fig
   cluster  count
0        0  19473
1        1  39891
2        2  13134

11 Visualization: Cluster Distribution

import pandas as pd
import plotly.express as px

# Visualization: Cluster Distribution

# Get cluster counts(
mapping = {0: 2, 1: 0, 2: 1}

df_clustered["cluster_spark"] = df_clustered["cluster"].map(mapping)

cluster_counts = (
    df_clustered["cluster_spark"]
    .value_counts()
    .sort_index()
)


fig = px.bar(
    x=cluster_counts.index,
    y=cluster_counts.values,
    color=cluster_counts.values,          
    color_continuous_scale="Blues",      
    labels={"x": "Cluster", "y": "Number of Jobs", "color": "Number of Jobs"},
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    template="plotly_white",
)

fig.update_layout(font=dict(family="Roboto", size=14))

fig.write_image("figures/kmeans_cluster_distribution.jpg")
fig

12 Choropleth Map: Remote Work Percentage by State

import pandas as pd
import numpy as np

# Calculate remote work percentage by state (pandas version)

# 1.STATE_NAME & REMOTE_GROUP
state_remote = (
    df_analysis
    .groupby(["STATE_NAME", "REMOTE_GROUP"])
    .size()
    .reset_index(name="count")
)

# 2. pivot:index = STATE_NAME,columns = REMOTE_GROUP(Onsite / Remote / Hybrid)
state_df = (
    state_remote
    .pivot(index="STATE_NAME", columns="REMOTE_GROUP", values="count")
    .fillna(0)
    .reset_index()
)

# Ensure that the "Hybrid" column exists
if "Hybrid" not in state_df.columns:
    state_df["Hybrid"] = 0

# Calculate Total and Remote_Pct
state_df["Total"] = state_df["Onsite"] + state_df["Remote"] + state_df["Hybrid"]
state_df["Remote_Pct"] = (state_df["Remote"] / state_df["Total"] * 100).round(2)

print("--- State Remote Work Data ---")
print(state_df.head(10))
--- State Remote Work Data ---
REMOTE_GROUP   STATE_NAME  Hybrid  Onsite  Remote   Total  Remote_Pct
0                 Alabama    15.0   567.0   108.0   690.0       15.65
1                  Alaska     7.0   132.0    97.0   236.0       41.10
2                 Arizona    43.0  1349.0   246.0  1638.0       15.02
3                Arkansas    12.0   464.0   108.0   584.0       18.49
4              California   262.0  5766.0  1056.0  7084.0       14.91
5                Colorado    55.0  1141.0   259.0  1455.0       17.80
6             Connecticut    41.0   660.0   162.0   863.0       18.77
7                Delaware    15.0   330.0    93.0   438.0       21.23
8                 Florida    72.0  3060.0   513.0  3645.0       14.07
9                 Georgia    64.0  2169.0   425.0  2658.0       15.99

13 Add state abbreviations for Plotly map

# State name to abbreviation mapping
state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC'
}

# Add state abbreviation column
state_df['State_Abbrev'] = state_df['STATE_NAME'].map(state_abbrev)

# Remove rows without valid state abbreviation (e.g., "Unknown")
state_df_clean = state_df[state_df['State_Abbrev'].notna()]

print(f"States with data: {len(state_df_clean)}")
print(state_df_clean[['STATE_NAME', 'State_Abbrev', 'Total', 'Remote_Pct']].head(10))
States with data: 50
REMOTE_GROUP   STATE_NAME State_Abbrev   Total  Remote_Pct
0                 Alabama           AL   690.0       15.65
1                  Alaska           AK   236.0       41.10
2                 Arizona           AZ  1638.0       15.02
3                Arkansas           AR   584.0       18.49
4              California           CA  7084.0       14.91
5                Colorado           CO  1455.0       17.80
6             Connecticut           CT   863.0       18.77
7                Delaware           DE   438.0       21.23
8                 Florida           FL  3645.0       14.07
9                 Georgia           GA  2658.0       15.99

14 Choropleth Map with State Labels showing Remote Percentage

import plotly.graph_objects as go

# Choropleth Map with State Labels showing Remote Percentage

fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Remote_Pct'],
    locationmode='USA-states',
    colorscale='Blues',
    colorbar_title='Remote %',
    text=state_df_clean['State_Abbrev'] + '<br>' + state_df_clean['Remote_Pct'].astype(str) + '%',
    hovertemplate='<b>%{text}</b><br>Total Jobs: %{customdata[0]}<br>Remote Jobs: %{customdata[1]}<extra></extra>',
    customdata=state_df_clean[['Total', 'Remote']].values,
    marker_line_color='white',
    marker_line_width=1
))

# Add state abbreviations with percentages as annotations
fig.add_scattergeo(
    locations=state_df_clean['State_Abbrev'],
    locationmode='USA-states',
    text=state_df_clean['Remote_Pct'].apply(lambda x: f'{x:.0f}%'),
    mode='text',
    textfont=dict(size=8, color='black', family='Arial Black'),
    showlegend=False
)

fig.update_layout(
    title_text='Remote Work Opportunity by State (% of Jobs that are Remote)',
    title_font_size=18,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        bgcolor='rgba(0,0,0,0)'
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000
)

fig.write_image("figures/choropleth_remote_work_with_labels.jpg")
print("Enhanced choropleth map saved!")
fig
Enhanced choropleth map saved!

15 Choropleth Map: Total Job Postings by State (with labels)

import plotly.graph_objects as go

# Choropleth Map: Total Job Postings by State (with labels)

# Get top 15 states by total jobs for labeling
top_states = state_df_clean.nlargest(15, 'Total')['State_Abbrev'].tolist()

fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Total'],
    locationmode='USA-states',
    colorscale='Greens',
    colorbar_title='Total Jobs',
    hovertemplate='<b>%{location}</b><br>Total Jobs: %{z:,}<br>Remote: %{customdata[0]:,}<br>Onsite: %{customdata[1]:,}<extra></extra>',
    customdata=state_df_clean[['Remote', 'Onsite']].values,
    marker_line_color='white',
    marker_line_width=1.5
))

# Add labels for top states (format large numbers with K)
top_state_df = state_df_clean[state_df_clean['State_Abbrev'].isin(top_states)].copy()
top_state_df['Total_Label'] = top_state_df['Total'].apply(
    lambda x: f'{x/1000:.1f}K' if x >= 1000 else str(int(x))
)

fig.add_scattergeo(
    locations=top_state_df['State_Abbrev'],
    locationmode='USA-states',
    text=top_state_df['Total_Label'],
    mode='text',
    textfont=dict(size=10, color='darkgreen', family='Arial Black'),
    showlegend=False
)

fig.update_layout(
    title_text='Total Job Postings by State<br><sup>Labels shown for top 15 states by job volume</sup>',
    title_font_size=16,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000
)


fig.write_image("figures/choropleth_total_jobs_with_labels.jpg")
print("Enhanced total jobs choropleth map saved!")
fig
Enhanced total jobs choropleth map saved!

16 Bar Chart: Top 10 States by Remote Work Percentage

# Filter states with at least 100 jobs for meaningful comparison
state_df_filtered = state_df_clean[state_df_clean['Total'] >= 100]

# Sort by remote percentage
top_remote_states = state_df_filtered.nlargest(10, 'Remote_Pct')

fig = px.bar(
    top_remote_states,
    x='STATE_NAME',
    y='Remote_Pct',
    color='Remote_Pct',
    color_continuous_scale='Blues',
    title='Top 10 States with Highest Remote Work Opportunities',
    labels={'STATE_NAME': 'State', 'Remote_Pct': 'Remote Work %'},
    text='Remote_Pct'
)

fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    xaxis_tickangle=-45,
    showlegend=False
)

fig.write_image("figures/top10_remote_states.jpg")
print("Top 10 remote states bar chart saved!")
fig
Top 10 remote states bar chart saved!